<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ArXiv CS.CV Papers (Image/Video Generation) - May 03, 2025</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/framer-motion/10.16.4/framer-motion.dev.js"></script>
    <!-- Example using Font Awesome (replace with your preferred icon library if needed) -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css">
    <style>
        @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

        :root {
            /* New Palette: Light, Clean, Futuristic with Teal/Aqua accents */
            --bg-color: #f8fafc; /* Tailwind slate-50 (Very Light Gray) */
            --card-bg-color: #ffffff; /* White */
            --text-color: #1e293b; /* Tailwind slate-800 (Dark Gray-Blue) */
            --text-muted-color: #64748b; /* Tailwind slate-500 (Medium Gray-Blue) */
            --header-color: #0f172a; /* Tailwind slate-900 (Very Dark Blue) */
            --highlight-primary: #14b8a6; /* Tailwind teal-500 */
            --highlight-secondary: #67e8f9; /* Tailwind cyan-300 */
            --border-color: #e2e8f0; /* Tailwind slate-200 (Light Gray) */
            --shadow-color: rgba(15, 23, 42, 0.08); /* Subtle shadow based on slate-900 */
        }

        body {
            background-color: var(--bg-color);
            color: var(--text-color);
            font-family: 'Inter', sans-serif;
            overflow-x: hidden; /* Prevent horizontal scroll */
            line-height: 1.6;
        }

        .bento-grid {
            display: grid;
            gap: 1.5rem; /* Tailwind gap-6 */
            grid-template-columns: 1fr; /* Force single column */
            padding-bottom: 4rem; /* Add padding at the bottom */
        }

        .bento-item {
            /* Apply semi-transparent white background and blur */
            background-color: rgba(255, 255, 255, 0.7); /* White with 70% opacity */
            backdrop-filter: blur(10px); /* Apply blur effect */
            -webkit-backdrop-filter: blur(10px); /* Safari prefix */
            border-radius: 1rem; /* Slightly larger radius */
            padding: 1.75rem; /* Slightly more padding */
            border: 1px solid rgba(226, 232, 240, 0.5); /* Lighter border with transparency */
            box-shadow: 0 4px 12px var(--shadow-color);
            transition: transform 0.3s ease-out, box-shadow 0.3s ease-out, background-color 0.3s ease-out;
            overflow: hidden; /* Ensure content doesn't overflow */
            position: relative; /* For potential pseudo-elements */
        }

        /* Removed ::before pseudo-element for a cleaner look */


        .bento-item:hover {
            transform: translateY(-6px);
            box-shadow: 0 10px 20px var(--shadow-color), 0 4px 8px rgba(15, 23, 42, 0.06); /* Adjusted hover shadow */
        }

        .paper-title {
            font-size: 1.125rem; /* Tailwind text-lg */
            font-weight: 600; /* Tailwind font-semibold */
            color: var(--highlight-primary); /* Use new primary highlight */
            margin-bottom: 0.75rem; /* Tailwind mb-3 */
            line-height: 1.4;
        }

        .paper-summary {
            font-size: 0.875rem; /* Tailwind text-sm */
            color: var(--text-muted-color);
            margin-bottom: 1.25rem; /* Tailwind mb-5 */
            line-height: 1.6;
        }

        .paper-link {
            display: inline-flex; /* Use flex for icon alignment */
            align-items: center;
            font-size: 0.875rem; /* Tailwind text-sm */
            font-weight: 600;
            color: var(--highlight-primary);
            text-decoration: none;
            padding: 0.5rem 1rem; /* Add padding */
            border-radius: 0.5rem; /* Slightly rounder */
            background-color: rgba(20, 184, 166, 0.08); /* Subtle teal background */
            border: 1px solid rgba(20, 184, 166, 0.2);
            transition: background-color 0.3s ease, color 0.3s ease, transform 0.2s ease;
        }

        .paper-link i {
            margin-right: 0.5rem; /* Tailwind mr-2 */
            transition: transform 0.3s ease;
        }

        .paper-link:hover {
            background-color: rgba(20, 184, 166, 0.15);
            color: #0d9488; /* Darker teal on hover */
            transform: translateY(-1px);
        }
        .paper-link:hover i {
             transform: translateX(2px);
        }

        .paper-authors {
            font-size: 0.75rem; /* Tailwind text-xs */
            color: var(--text-muted-color);
            margin-top: 1rem; /* Tailwind mt-4 */
            font-style: italic;
        }

        .header {
            text-align: center;
            margin-bottom: 3rem; /* Tailwind mb-12 */
            padding-top: 3rem; /* Tailwind pt-12 */
        }

        .header h1 {
            font-size: 2.5rem; /* Tailwind text-4xl or 5xl */
            font-weight: 700; /* Tailwind font-bold */
            color: var(--header-color);
            letter-spacing: -0.025em; /* Tailwind tracking-tight */
            margin-bottom: 0.5rem;
            /* Optional: Add a subtle text gradient */
            /* background: linear-gradient(90deg, var(--highlight-primary), var(--highlight-secondary)); */
            /* -webkit-background-clip: text; */
            /* -webkit-text-fill-color: transparent; */
        }

        .header p {
            font-size: 1.125rem; /* Tailwind text-lg */
            color: var(--text-muted-color);
            margin-top: 0.5rem; /* Tailwind mt-2 */
            max-width: 600px;
            margin-left: auto;
            margin-right: auto;
        }

        .footer {
            text-align: center;
            color: var(--text-muted-color);
            font-size: 0.875rem; /* Tailwind text-sm */
            padding-top: 2rem;
            padding-bottom: 2rem; /* Tailwind py-8 */
            border-top: 1px solid var(--border-color);
            margin-top: 4rem;
        }

        /* Simple line graphic element (optional) */
        .line-graphic {
            height: 1px; /* Thinner line */
            background: linear-gradient(90deg, rgba(20, 184, 166, 0), var(--highlight-primary), rgba(20, 184, 166, 0));
            opacity: 0.6;
            margin: 1.5rem 0; /* Adjust margin */
        }

        /* Framer Motion requires the script, styles enhance appearance */
        [data-motion-element] {
             /* Base styles for elements animated by Framer Motion */
        }

        .paper-tldr {
            font-size: 0.95rem; /* Slightly bigger than summary */
            color: #475569; /* Changed to Tailwind slate-600 (slightly darker than summary) */
            margin-top: 0.75rem; /* Tailwind mt-3 */
            margin-bottom: 0.75rem; /* Tailwind mb-3 */
            /* font-style: italic; */
            font-weight: bold;
        }

        .paper-rating {
            margin-top: 1rem; /* Tailwind mt-4 */
            margin-bottom: 1rem; /* Tailwind mb-4 */
            color: #f59e0b; /* Tailwind amber-500 */
        }

        .paper-rating i {
            margin-right: 0.125rem; /* Tailwind mr-0.5 */
        }

        /* Apply consistent star color to sub-ratings */
        .paper-sub-ratings .rating-item i {
            color: #f59e0b; /* Match overall rating star color (amber-500) */
            margin-right: 0.125rem; /* Consistent spacing */
        }

    </style>
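    <!--
      Note: the <motion.div> tags below use JSX-style attributes; the framer-motion UMD build
      loaded above does not process them in a static page, so the entrance animations will not
      run by themselves. The sketch below is an assumed vanilla-JS fallback (not part of the
      original pipeline) that fades each [data-motion-element] card in as it scrolls into view.
    -->
    <script>
        document.addEventListener('DOMContentLoaded', function () {
            var items = document.querySelectorAll('[data-motion-element]');
            items.forEach(function (el) {
                // Start hidden and slightly offset, mirroring the intended initial state.
                el.style.opacity = '0';
                el.style.transform = 'translateY(30px)';
                el.style.transition = 'opacity 0.5s ease-out, transform 0.5s ease-out';
            });
            // Reveal each card once roughly 20% of it is visible, then stop observing it.
            var observer = new IntersectionObserver(function (entries) {
                entries.forEach(function (entry) {
                    if (entry.isIntersecting) {
                        entry.target.style.opacity = '1';
                        entry.target.style.transform = 'translateY(0)';
                        observer.unobserve(entry.target);
                    }
                });
            }, { threshold: 0.2 });
            items.forEach(function (el) {
                observer.observe(el);
            });
        });
    </script>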
</head>
<body class="container mx-auto px-4 antialiased">

    <motion.div
        initial="{ opacity: 0, y: -30 }"
        animate="{ opacity: 1, y: 0 }"
        transition="{ duration: 0.6, ease: 'easeOut' }"
        class="header"
        data-motion-element
    >
        <h1>AIGC Daily Papers</h1>
        <p>Daily papers related to Image/Video/Multimodal Generation from cs.CV</p>
        <p>May 03, 2025</p>
        <div class="line-graphic mt-4 mb-8 mx-auto w-1/4"></div> <!-- Added line graphic -->
    </motion.div>

    <div class="bento-grid" id="paper-grid">
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.0, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Any-to-Any Vision-Language Model for Multimodal X-ray Imaging and Radiological Report Generation</h2>
            <p class="paper-summary">Generative models have revolutionized Artificial Intelligence (AI),
particularly in multimodal applications. However, adapting these models to the
medical domain poses unique challenges due to the complexity of medical data
and the stringent need for clinical accuracy. In this work, we introduce a
framework specifically designed for multimodal medical data generation. By
enabling the generation of multi-view chest X-rays and their associated
clinical report, it bridges the gap between general-purpose vision-language
models and the specialized requirements of healthcare. Leveraging the MIMIC-CXR
dataset, the proposed framework shows superior performance in generating
high-fidelity images and semantically coherent reports. Our quantitative
evaluation reveals significant results in terms of FID and BLEU scores,
showcasing the quality of the generated data. Notably, our framework achieves
comparable or even superior performance compared to real data on downstream
disease classification tasks, underlining its potential as a tool for medical
research and diagnostics. This study highlights the importance of
domain-specific adaptations in enhancing the relevance and utility of
generative models for clinical applications, paving the way for future
advancements in synthetic multimodal medical data generation.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces a vision-language model tailored for generating multi-view chest x-rays and corresponding clinical reports using the mimic-cxr dataset, demonstrating promising results in image fidelity and report coherence, even outperforming real data in disease classification.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了一种视觉-语言模型，专门用于生成多角度胸部x光片和相应的临床报告，该模型使用mimic-cxr数据集，在图像保真度和报告连贯性方面表现出良好的效果，甚至在疾病分类方面优于真实数据。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(10/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(9/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01091v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Daniele Molino, Francesco di Feola, Linlin Shen, Paolo Soda, Valerio Guarrasi</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.05, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">GENMO: A GENeralist Model for Human MOtion</h2>
            <p class="paper-summary">Human motion modeling traditionally separates motion generation and
estimation into distinct tasks with specialized models. Motion generation
models focus on creating diverse, realistic motions from inputs like text,
audio, or keyframes, while motion estimation models aim to reconstruct accurate
motion trajectories from observations like videos. Despite sharing underlying
representations of temporal dynamics and kinematics, this separation limits
knowledge transfer between tasks and requires maintaining separate models. We
present GENMO, a unified Generalist Model for Human Motion that bridges motion
estimation and generation in a single framework. Our key insight is to
reformulate motion estimation as constrained motion generation, where the
output motion must precisely satisfy observed conditioning signals. Leveraging
the synergy between regression and diffusion, GENMO achieves accurate global
motion estimation while enabling diverse motion generation. We also introduce
an estimation-guided training objective that exploits in-the-wild videos with
2D annotations and text descriptions to enhance generative diversity.
Furthermore, our novel architecture handles variable-length motions and mixed
multimodal conditions (text, audio, video) at different time intervals,
offering flexible control. This unified approach creates synergistic benefits:
generative priors improve estimated motions under challenging conditions like
occlusions, while diverse video data enhances generation capabilities.
Extensive experiments demonstrate GENMO's effectiveness as a generalist
framework that successfully handles multiple human motion tasks within a single
model.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces genmo, a generalist model for human motion that unifies motion generation and estimation tasks within a single framework, achieving state-of-the-art results by reformulating motion estimation as constrained generation and using a novel architecture for handling multimodal conditions.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了genmo，一个用于人体运动的通用模型，它将运动生成和估计任务统一在一个框架中，通过将运动估计重新表述为约束生成，并使用一种新颖的架构来处理多模态条件，从而实现了最先进的结果。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01425v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Jiefeng Li, Jinkun Cao, Haotian Zhang, Davis Rempe, Jan Kautz, Umar Iqbal, Ye Yuan</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.1, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">TSTMotion: Training-free Scene-awarenText-to-motion Generation</h2>
            <p class="paper-summary">Text-to-motion generation has recently garnered significant research
interest, primarily focusing on generating human motion sequences in blank
backgrounds. However, human motions commonly occur within diverse 3D scenes,
which has prompted exploration into scene-aware text-to-motion generation
methods. Yet, existing scene-aware methods often rely on large-scale
ground-truth motion sequences in diverse 3D scenes, which poses practical
challenges due to the high cost of collecting such data. To mitigate this
challenge, we are the first to propose a Training-free Scene-aware
Text-to-Motion framework, dubbed TSTMotion, that
efficiently empowers pre-trained blank-background motion generators with the
scene-aware capability. Specifically, conditioned on the given 3D scene and
text description, we adopt foundation models together to reason, predict and
validate a scene-aware motion guidance. Then, the motion guidance is
incorporated into the blank-background motion generators with two
modifications, resulting in scene-aware text-driven motion sequences. Extensive
experiments demonstrate the efficacy and generalizability of our proposed
framework. We release our code on the project page:
https://tstmotion.github.io/.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces tstmotion, a training-free framework for generating scene-aware human motions from text descriptions by leveraging foundation models to guide pre-trained blank-background motion generators.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文提出了tstmotion，一个无需训练的框架，通过利用基础模型来引导预训练的空白背景运动生成器，从而从文本描述生成场景感知的人体运动。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01182v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Ziyan Guo, Haoxuan Qu, Hossein Rahmani, Dewen Soh, Ping Hu, Qiuhong Ke, Jun Liu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.15000000000000002, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">FreePCA: Integrating Consistency Information across Long-short Frames in Training-free Long Video Generation via Principal Component Analysis</h2>
            <p class="paper-summary">Long video generation involves generating extended videos using models
trained on short videos, suffering from distribution shifts due to varying
frame counts. It necessitates the use of local information from the original
short frames to enhance visual and motion quality, and global information from
the entire long frames to ensure appearance consistency. Existing training-free
methods struggle to effectively integrate the benefits of both, as appearance
and motion in videos are closely coupled, leading to motion inconsistency and
degraded visual quality. In this paper, we reveal that global and local information can
be precisely decoupled into consistent appearance and motion intensity
information by applying Principal Component Analysis (PCA), allowing for
refined complementary integration of global consistency and local quality. With
this insight, we propose FreePCA, a training-free long video generation
paradigm based on PCA that simultaneously achieves high consistency and
quality. Concretely, we decouple consistent appearance and motion intensity
features by measuring cosine similarity in the principal component space.
Critically, we progressively integrate these features to preserve original
quality and ensure smooth transitions, while further enhancing consistency by
reusing the mean statistics of the initial noise. Experiments demonstrate that
FreePCA can be applied to various video diffusion models without requiring
training, leading to substantial improvements. Code is available at
https://github.com/JosephTiTan/FreePCA.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: freepca addresses the distribution shift problem in training-free long video generation by using principal component analysis (pca) to decouple and integrate appearance and motion information, leading to improved consistency and quality without retraining.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: freepca通过主成分分析(pca)解耦并整合外观和运动信息，解决了免训练长视频生成中的分布偏移问题，从而在无需重新训练的情况下提高了视频的连贯性和质量。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01172v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Jiangtong Tan, Hu Yu, Jie Huang, Jie Xiao, Feng Zhao</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.2, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">VSC: Visual Search Compositional Text-to-Image Diffusion Model</h2>
            <p class="paper-summary">Text-to-image diffusion models have shown impressive capabilities in
generating realistic visuals from natural-language prompts, yet they often
struggle with accurately binding attributes to corresponding objects,
especially in prompts containing multiple attribute-object pairs. This
challenge primarily arises from the limitations of commonly used text encoders,
such as CLIP, which can fail to encode complex linguistic relationships and
modifiers effectively. Existing approaches have attempted to mitigate these
issues through attention map control during inference and the use of layout
information or fine-tuning during training, yet they face performance drops
with increased prompt complexity. In this work, we introduce a novel
compositional generation method that leverages pairwise image embeddings to
improve attribute-object binding. Our approach decomposes complex prompts into
sub-prompts, generates corresponding images, and computes visual prototypes
that fuse with text embeddings to enhance representation. By applying
segmentation-based localization training, we address cross-attention
misalignment, achieving improved accuracy in binding multiple attributes to
objects. Our approach outperforms existing compositional text-to-image
diffusion models on the T2I-CompBench benchmark, achieving better
human-evaluated image quality and improved robustness as the number of
binding pairs in the prompt increases.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces a new compositional text-to-image diffusion model (vsc) that improves attribute-object binding in complex prompts by decomposing prompts, generating visual prototypes, and using segmentation-based localization training. it outperforms existing models on t2i compbench.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文提出了一种新的组合文本到图像扩散模型（vsc），通过分解提示、生成视觉原型和使用基于分割的定位训练，提高了复杂提示中属性-对象绑定的准确性。 该模型在t2i compbench上优于现有模型。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01104v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Do Huu Dat, Nam Hyeonu, Po-Yuan Mao, Tae-Hyun Oh</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.25, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Generating Animated Layouts as Structured Text Representations</h2>
            <p class="paper-summary">Despite the remarkable progress in text-to-video models, achieving precise
control over text elements and animated graphics remains a significant
challenge, especially in applications such as video advertisements. To address
this limitation, we introduce Animated Layout Generation, a novel approach to
extend static graphic layouts with temporal dynamics. We propose a Structured
Text Representation for fine-grained video control through hierarchical visual
elements. To demonstrate the effectiveness of our approach, we present VAKER
(Video Ad maKER), a text-to-video advertisement generation pipeline that
combines a three-stage generation process with Unstructured Text Reasoning for
seamless integration with LLMs. VAKER fully automates video advertisement
generation by incorporating dynamic layout trajectories for objects and
graphics across specific video frames. Through extensive evaluations, we
demonstrate that VAKER significantly outperforms existing methods in generating
video advertisements. Project Page:
https://yeonsangshin.github.io/projects/Vaker</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces animated layout generation and vaker, a text-to-video advertisement generation pipeline, which uses structured text representations to control dynamic layouts and outperforms existing methods.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了动画布局生成和 vaker (一个文本到视频广告生成流程)，它使用结构化的文本表示来控制动态布局，并且超越了现有方法。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.00975v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Yeonsang Shin, Jihwan Kim, Yumin Song, Kyungseung Lee, Hyunhee Chung, Taeyoung Na</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.30000000000000004, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Improving Editability in Image Generation with Layer-wise Memory</h2>
            <p class="paper-summary">Most real-world image editing tasks require multiple sequential edits to
achieve desired results. Current editing approaches, primarily designed for
single-object modifications, struggle with sequential editing, especially with
maintaining previous edits while naturally adapting new objects into the
existing content. These limitations significantly hinder complex editing
scenarios where multiple objects need to be modified while preserving their
contextual relationships. We address this fundamental challenge through two key
proposals: enabling rough mask inputs that preserve existing content while
naturally integrating new elements and supporting consistent editing across
multiple modifications. Our framework achieves this through layer-wise memory,
which stores latent representations and prompt embeddings from previous edits.
We propose Background Consistency Guidance that leverages memorized latents to
maintain scene coherence and Multi-Query Disentanglement in cross-attention
that ensures natural adaptation to existing content. To evaluate our method, we
present a new benchmark dataset incorporating semantic alignment metrics and
interactive editing scenarios. Through comprehensive experiments, we
demonstrate superior performance in iterative image editing tasks with minimal
user effort, requiring only rough masks while maintaining high-quality results
throughout multiple editing steps.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper proposes a layer-wise memory framework with background consistency guidance and multi-query disentanglement for improving the editability of image generation, especially in sequential editing tasks with multiple objects.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出了一种分层记忆框架，包含背景一致性引导和多查询解耦，以提高图像生成的可编辑性，尤其是在涉及多个对象的顺序编辑任务中。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01079v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Daneul Kim, Jaeah Lee, Jaesik Park</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.35000000000000003, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">SpatialLLM: A Compound 3D-Informed Design towards Spatially-Intelligent Large Multimodal Models</h2>
            <p class="paper-summary">Humans naturally understand 3D spatial relationships, enabling complex
reasoning like predicting collisions of vehicles from different directions.
Current large multimodal models (LMMs), however, lack of this capability of 3D
spatial reasoning. This limitation stems from the scarcity of 3D training data
and the bias in current model designs toward 2D data. In this paper, we
systematically study the impact of 3D-informed data, architecture, and training
setups, introducing SpatialLLM, a large multi-modal model with advanced 3D
spatial reasoning abilities. To address data limitations, we develop two types
of 3D-informed training datasets: (1) 3D-informed probing data focused on
an object's 3D location and orientation, and (2) 3D-informed conversation data for
complex spatial relationships. Notably, we are the first to curate VQA data
that incorporate 3D orientation relationships on real images. Furthermore, we
systematically integrate these two types of training data with the
architectural and training designs of LMMs, providing a roadmap for optimal
design aimed at achieving superior 3D reasoning capabilities. Our SpatialLLM
advances machines toward highly capable 3D-informed reasoning, surpassing
GPT-4o performance by 8.7%. Our systematic empirical design and the resulting
findings offer valuable insights for future research in this direction.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces spatialllm, a large multimodal model designed to enhance 3d spatial reasoning by using 3d-informed datasets and architectural/training designs, outperforming gpt-4o in certain benchmarks.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了spatialllm，一个大型多模态模型，通过使用3d信息数据集和架构/训练设计来增强3d空间推理能力，在特定基准测试中优于gpt-4o。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(4/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(6/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.00788v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Wufei Ma, Luoxin Ye, Nessa McWeeney, Celso M de Melo, Alan Yuille, Jieneng Chen</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.4, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Evaluating Vision Language Model Adaptations for Radiology Report Generation in Low-Resource Languages</h2>
            <p class="paper-summary">The integration of artificial intelligence in healthcare has opened new
horizons for improving medical diagnostics and patient care. However,
challenges persist in developing systems capable of generating accurate and
contextually relevant radiology reports, particularly in low-resource
languages. In this study, we present a comprehensive benchmark to evaluate the
performance of instruction-tuned Vision-Language Models (VLMs) in the
specialized task of radiology report generation across three low-resource
languages: Italian, German, and Spanish. Employing the LLaVA architectural
framework, we conducted a systematic evaluation of pre-trained models utilizing
general datasets, domain-specific datasets, and low-resource language-specific
datasets. In light of the unavailability of models that possess prior knowledge
of both the medical domain and low-resource languages, we analyzed various
adaptations to determine the most effective approach for these contexts. The
results revealed that language-specific models substantially outperformed both
general and domain-specific models in generating radiology reports, emphasizing
the critical role of linguistic adaptation. Additionally, models fine-tuned
with medical terminology exhibited enhanced performance across all languages
compared to models with generic knowledge, highlighting the importance of
domain-specific training. We also explored the influence of the temperature
parameter on the coherence of report generation, providing insights for optimal
model settings. Our findings highlight the importance of tailored language and
domain-specific training for improving the quality and accuracy of radiological
reports in multilingual settings. This research not only advances our
understanding of VLMs adaptability in healthcare but also points to significant
avenues for future investigations into model tuning and language-specific
adaptations.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper benchmarks instruction-tuned vision-language models (vlms) using the llava framework for radiology report generation in low-resource languages, finding that language-specific and domain-specific fine-tuning significantly improves performance.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文使用 llava 框架，针对低资源语言的放射学报告生成，对指令微调的视觉-语言模型 (vlm) 进行了基准测试，发现特定语言和特定领域的微调显着提高了性能。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01096v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Marco Salmè, Rosa Sicilia, Paolo Soda, Valerio Guarrasi</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.45, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Multimodal Doctor-in-the-Loop: A Clinically-Guided Explainable Framework for Predicting Pathological Response in Non-Small Cell Lung Cancer</h2>
            <p class="paper-summary">This study proposes a novel approach combining Multimodal Deep Learning with
intrinsic eXplainable Artificial Intelligence techniques to predict
pathological response in non-small cell lung cancer patients undergoing
neoadjuvant therapy. To address the limitations of existing radiomics and
unimodal deep learning approaches, we introduce an intermediate fusion strategy that
integrates imaging and clinical data, enabling efficient interaction between
data modalities. The proposed Multimodal Doctor-in-the-Loop method further
enhances clinical relevance by embedding clinicians' domain knowledge directly
into the training process, guiding the model's focus gradually from broader
lung regions to specific lesions. Results demonstrate improved predictive
accuracy and explainability, providing insights into optimal data integration
strategies for clinical applications.</p>
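            <p class="paper-summary">For readers unfamiliar with the term, the sketch below illustrates what an intermediate (feature-level) fusion of imaging and clinical data can look like in PyTorch; the layer sizes, feature dimensions, and class name are illustrative assumptions, not the paper's architecture.</p>
            <pre><code># Minimal sketch (assumed dimensions, not the paper's architecture): intermediate
# fusion concatenates per-modality embeddings before a shared prediction head.
import torch
import torch.nn as nn

class IntermediateFusion(nn.Module):
    def __init__(self, img_feat_dim=512, clin_dim=16, hidden=128):
        super().__init__()
        self.img_branch = nn.Sequential(nn.Linear(img_feat_dim, hidden), nn.ReLU())
        self.clin_branch = nn.Sequential(nn.Linear(clin_dim, hidden), nn.ReLU())
        self.head = nn.Linear(2 * hidden, 1)   # pathological-response logit

    def forward(self, img_features, clinical):
        fused = torch.cat([self.img_branch(img_features),
                           self.clin_branch(clinical)], dim=-1)
        return self.head(fused)

model = IntermediateFusion()
logits = model(torch.randn(4, 512), torch.randn(4, 16))   # batch of 4 patients
print(logits.shape)   # torch.Size([4, 1])
</code></pre>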
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper presents a multimodal deep learning framework with doctor-in-the-loop explainability for predicting pathological response in non-small cell lung cancer, integrating imaging and clinical data.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出了一种多模态深度学习框架，通过医生参与的解释性方法来预测非小细胞肺癌的病理反应，集成了影像和临床数据。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01390v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Alice Natalina Caragliano, Claudia Tacconi, Carlo Greco, Lorenzo Nibid, Edy Ippolito, Michele Fiore, Giuseppe Perrone, Sara Ramella, Paolo Soda, Valerio Guarrasi</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.5, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Edge Detection based on Channel Attention and Inter-region Independence Test</h2>
            <p class="paper-summary">Existing edge detection methods often suffer from noise amplification and
excessive retention of non-salient details, limiting their applicability in
high-precision industrial scenarios. To address these challenges, we propose
CAM-EDIT, a novel framework that integrates Channel Attention Mechanism (CAM)
and Edge Detection via Independence Testing (EDIT). The CAM module adaptively
enhances discriminative edge features through multi-channel fusion, while the
EDIT module employs region-wise statistical independence analysis (using
Fisher's exact test and chi-square test) to suppress uncorrelated noise.
Extensive experiments on the BSDS500 and NYUDv2 datasets demonstrate
state-of-the-art performance. Among the nine comparison algorithms, CAM-EDIT
achieves F-measure scores of 0.635 and 0.460, improvements of 19.2% to 26.5%
over traditional methods (Canny, CannySR), and it outperforms recent
learning-based methods (TIP2020, MSCNGP). Noise robustness evaluations further
reveal a 2.2% PSNR improvement under Gaussian noise compared to baseline
methods. Qualitative results exhibit cleaner edge maps with reduced artifacts,
demonstrating the method's potential for high-precision industrial
applications.</p>
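            <p class="paper-summary">To make the EDIT idea concrete, here is a small sketch of a region-wise independence test of the kind the abstract mentions (Fisher's exact test for sparse counts, chi-square otherwise); the 2x2 contingency construction and the threshold are illustrative assumptions, not the authors' implementation.</p>
            <pre><code># Illustrative sketch (assumptions, not the authors' code): test whether
# thresholded edge responses in two neighbouring regions are statistically
# independent; a high p-value suggests the responses behave like uncorrelated noise.
import numpy as np
from scipy.stats import chi2_contingency, fisher_exact

def region_independence_pvalue(region_a, region_b, threshold=0.5):
    """Build a 2x2 contingency table of thresholded responses and test it."""
    a = np.asarray(region_a).ravel() > threshold
    b = np.asarray(region_b).ravel() > threshold
    table = np.array([[np.sum(np.logical_and(a, b)),  np.sum(np.logical_and(a, ~b))],
                      [np.sum(np.logical_and(~a, b)), np.sum(np.logical_and(~a, ~b))]])
    if table.min() >= 5:                   # large counts: chi-square test
        _, p, _, _ = chi2_contingency(table)
    else:                                  # sparse counts: Fisher's exact test
        _, p = fisher_exact(table)
    return p

rng = np.random.default_rng(0)
print(region_independence_pvalue(rng.random((8, 8)), rng.random((8, 8))))
</code></pre>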
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces cam-edit, a novel edge detection framework using channel attention and inter-region independence testing, achieving state-of-the-art performance on bsds500 and nyudv2 datasets, particularly robust to noise and suited for high-precision industrial applications.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了cam-edit，一种新颖的边缘检测框架，它使用通道注意力和区域间独立性测试，在bsds500和nyudv2数据集上实现了最先进的性能，尤其是在噪声方面表现出强大的鲁棒性，适用于高精度工业应用。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01040v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Ru-yu Yan, Da-Qing Zhang</p>
            
        </motion.div>
        
    </div>

    <footer class="footer">
        Generated on 2025-05-05 04:31:16 UTC. Powered by <a href="https://github.com/onion-liu" target="_blank">onion-liu</a>.
    </footer>

</body>
</html>